--- redirect_from: - "/03code/code" interact_link: content/03Code/Code.ipynb kernel_name: python3 kernel_path: content/03Code has_widgets: false title: |- Code pagenum: 1 prev_page: url: /01Introduction/intro.html next_page: url: suffix: .ipynb search: data using method encoding removing outliers values variables transformations missing z score categorical power interquartile range made features standardizing normalizing yeo johnosn transfromation imputing cateorical principal component analysis value being grouping represents following dimensionality equipment dataset different methods reduction components variance singular decomposition algorithms sparse inital not where choosing applying reducing feature label results entires imputed next any transformed decimal best imputation respectively set visualizing predicting our accuracy transformation before need distribution plot previous right order preserve pca svd model thus same newly nominal encoder ordinality might models testing xgbregressor reading preprocessing read mongodb database updated daily new autoscout website comment: "***PROGRAMMATICALLY GENERATED, DO NOT EDIT. SEE ORIGINAL FILES IN /content***" ---
Code
import numpy as np
import pandas as pd
from sklearn import decomposition
from pymongo import MongoClient 
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from scipy import stats
from sklearn.preprocessing import PowerTransformer
import numpy as np
from sklearn.manifold import Isomap
import matplotlib.pyplot as plt
from sklearn.metrics import  mean_absolute_percentage_error
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import PowerTransformer
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import plotly.graph_objects as go
import plotly.tools as tls
from plotly.offline import plot, iplot, init_notebook_mode
from IPython.core.display import display, HTML
from plotly.subplots import make_subplots
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Enable offline (inline) plotly rendering in the notebook.
init_notebook_mode(connected = True)
# Shared plotly display options: hide the edit link and the mode bar.
config={'showLink': False, 'displayModeBar': False}

# Numeric attributes of a listing, used by the outlier-removal /
# scaling / imputation stages of the pipeline.
num_att = ['First Registration',
 'Mileage',
 'Power(hp)',
 'Displacement']

# Same numeric attribute list kept under a second name for the
# feature-matrix (X) side of the pipeline.
X_num_att = ['First Registration',
 'Mileage',
 'Power(hp)',
 'Displacement']

# Nominal (categorical) attributes that get one-hot encoded.
cat_att = ['Make', 'Model', 'Body', 'Fuel','Gearing Type']


class InitalCleaning(BaseEstimator, TransformerMixin):
    """First-pass cleaning of the raw AutoScout24 listings.

    Drops bookkeeping columns, coerces numeric columns from their string
    form, removes rows with missing mandatory fields, filters out
    implausible ("fake") ads and normalizes Fuel / Gearing Type.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, play_df, y=None):
        play_df = play_df.drop(columns=['ID', 'Loaded_in_DW', 'Model Code'])

        # Adjust column types (values arrive as strings from MongoDB).
        for col in ('Power(hp)', 'Displacement', 'Mileage', 'Price'):
            play_df[col] = pd.to_numeric(play_df[col])

        # Drop rows missing mandatory fields that cannot be imputed.
        play_df = play_df[~(play_df['Make'].isna() | play_df['Model'].isna())]
        play_df = play_df[~(play_df['Displacement'].isna() & play_df['Power(hp)'].isna())]
        play_df = play_df[~(play_df["Body"].isna())]
        play_df = play_df[~(play_df["Fuel"].isna())]
        play_df = play_df[~(play_df["Price"].isna())]

        # Drop fake ads: implausible displacement / power / price values.
        play_df = play_df[~(play_df['Displacement'] < 900)]
        play_df = play_df[~(play_df['Displacement'] > 70000)]
        # >500 hp is only believable for the premium German makes.
        premium = play_df['Make'].isin(['Audi', 'BMW', 'Mercedes-Benz'])
        play_df = play_df[~((play_df['Power(hp)'] > 500) & ~premium)]
        # <30 hp is only believable for electric cars.
        play_df = play_df[~((play_df['Power(hp)'] < 30) & (~(play_df['Fuel'] == 'Electricity')))]
        play_df = play_df[~(play_df['Price'] > 300000)]

        # Bug fix: boolean filtering above yields slices; take an explicit
        # copy before assigning columns to avoid chained-assignment
        # (SettingWithCopy) warnings / silently-lost writes.
        play_df = play_df.copy()
        # Keep only the primary fuel (e.g. "Diesel/Electric" -> "Diesel").
        play_df["Fuel"] = play_df["Fuel"].str.split("/").str[0]
        # Missing gearing type defaults to the most common value, 'Manual'.
        play_df['Gearing Type'] = play_df['Gearing Type'].replace({np.nan : 'Manual'})

        return play_df
    
    
class AdjustEquip(BaseEstimator, TransformerMixin):
    """Normalize the equipment flag columns (positions 9..78) to 0/1 ints.

    Raw scraped values are NaN (feature absent), '1', or free text such
    as a warranty description; anything non-empty is coerced to 1.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Stateless transformer.
        return self

    def transform(self, play_df, y=None):
        # Equipment flags occupy a fixed column range in the scraped layout.
        equipment = play_df.iloc[:, 9:79]
        equipment = equipment.replace({np.nan: 0}).replace({'1': 1})

        # Any leftover value (free-text warranty descriptions, etc.) means
        # the feature is present -> map it to 1.  Bug fix: use discard()
        # instead of remove() so the transform does not raise KeyError
        # when a batch happens to contain no 0s or no 1s in 'Warranty'.
        leftovers = set(equipment['Warranty'])
        leftovers.discard(0)
        leftovers.discard(1)
        if leftovers:
            equipment = equipment.replace(list(leftovers), 1)

        # Single vectorized cast instead of a per-column astype loop.
        play_df.iloc[:, 9:79] = equipment.astype(int)

        return play_df
    
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the nominal car attributes via pd.get_dummies."""
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Nothing to learn; dummies are recomputed on each transform.
        return self

    def transform(self, play_df, y=None):
        nominal_cols = ['Make', 'Model', 'Body', 'Fuel', 'Gearing Type']
        # dummy_na=False: a row with a missing category gets all-zero dummies.
        return pd.get_dummies(data=play_df, columns=nominal_cols, dummy_na=False)

class RegistrationTransformer(BaseEstimator, TransformerMixin):
    """Convert 'First Registration' to decimal years and subset the rows.

    tip == 0 -> keep BMW 1-series / 3-series first registered after 2005;
    tip == 1 -> keep only rows that still contain missing values.
    """
    def __init__(self, tip=0):
        self.tip = tip

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        # Reset the index so the converted dates align row-for-row with the
        # frame (index mismatches would introduce NaNs on assignment).
        play_df.reset_index(drop=True, inplace=True)
        decimal_years = date_magic(play_df['First Registration'])
        decimal_years.reset_index(drop=True, inplace=True)
        play_df['First Registration'] = decimal_years

        if self.tip == 0:
            is_bmw = play_df['Make'] == 'BMW'
            recent = play_df['First Registration'] > 2005
            series_1_or_3 = (play_df['Model'].str.startswith('3')
                             | play_df['Model'].str.startswith('1'))
            play_df = play_df[is_bmw & recent & series_1_or_3]
        elif self.tip == 1:
            play_df = play_df[play_df.isnull().any(axis=1)]
        return play_df


class IQROutlierRemoval_new(BaseEstimator, TransformerMixin):
    """Pass-through that projects the frame onto the configured numeric
    columns.  The IQR filtering itself is intentionally disabled in this
    variant; only the column selection remains."""
    def __init__(self, num_att):
        # Columns to keep.
        self.num_att = num_att

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        # Bug fix: use the configured self.num_att instead of silently
        # depending on the module-level `num_att` global.
        return pd.DataFrame(play_df, columns=self.num_att)


class IQROutlierRemoval(BaseEstimator, TransformerMixin):
    """Remove rows whose numeric attributes fall outside 1.5 * IQR.

    Returns only the numeric columns; the categorical columns are
    re-attached later in the pipeline (see JoinTransformer).
    """
    def __init__(self, num_att):
        self.num_att = num_att

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        bmw_num = play_df[self.num_att]
        q1 = bmw_num.quantile(0.25)
        q3 = bmw_num.quantile(0.75)
        iqr = q3 - q1
        # Tukey's fences: drop a row if ANY numeric value is an outlier.
        is_outlier = (bmw_num < (q1 - 1.5 * iqr)) | (bmw_num > (q3 + 1.5 * iqr))
        bmw_num = bmw_num[~is_outlier.any(axis=1)]
        # Bug fix: use self.num_att rather than the module-level `num_att`
        # global; also dropped the unused categorical split (`bmw_cat`).
        return pd.DataFrame(bmw_num, columns=self.num_att)
    

class ZScoreOutlierRemoval(BaseEstimator, TransformerMixin):
    """Drop rows containing any value with an absolute z-score of 3+."""
    def __init__(self, num_att):
        # Kept for pipeline-configuration symmetry; the filter is applied
        # to every column of the incoming frame.
        self.num_att = num_att

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        abs_z = np.abs(stats.zscore(play_df, nan_policy='omit'))
        keep_mask = (abs_z < 3).all(axis=1)
        return play_df[keep_mask]
    

class StandardScalerIndices(BaseEstimator, TransformerMixin):
    """StandardScaler wrapper that preserves the DataFrame index/columns.

    sklearn transformers return bare ndarrays; keeping the pandas metadata
    lets later index-based joins stay aligned.
    """
    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.scaler.fit(X)
        return self

    def transform(self, play_df, y=None):
        scaled = self.scaler.transform(play_df)
        return pd.DataFrame(scaled, columns=play_df.columns, index=play_df.index)

    
class PowerTransformerIndices(BaseEstimator, TransformerMixin):
    """PowerTransformer wrapper that preserves the DataFrame index/columns.

    `method` is forwarded to sklearn's PowerTransformer
    ('yeo-johnson' or 'box-cox').
    """
    def __init__(self, method):
        self.method = method
        self.transformer = PowerTransformer(method=self.method)

    def fit(self, X, y=None):
        self.transformer.fit(X)
        return self

    def transform(self, play_df, y=None):
        gaussianized = self.transformer.transform(play_df)
        return pd.DataFrame(gaussianized, columns=play_df.columns, index=play_df.index)
    
    
class RemovePrice(BaseEstimator, TransformerMixin):
    """Drop the regression target column 'Price' from the feature frame."""
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        return play_df.drop(columns='Price')

class IterativeImputerIndices(BaseEstimator, TransformerMixin):
    """IterativeImputer wrapper that preserves the DataFrame index/columns.

    `estimator` is the per-feature regressor used by IterativeImputer.
    """
    def __init__(self, estimator):
        self.imputer = IterativeImputer(estimator=estimator)

    def fit(self, X, y=None):
        self.imputer.fit(X)
        return self

    def transform(self, play_df, y=None):
        imputed = self.imputer.transform(play_df)
        return pd.DataFrame(imputed, columns=play_df.columns, index=play_df.index)

class JoinTransformer(BaseEstimator, TransformerMixin):
    """Re-attach the non-numeric columns of a reference frame by index."""
    def __init__(self, BMW_df, new_att):
        # Reference frame holding the columns to be joined back in.
        self.BMW_df = BMW_df
        # Columns already present in X; excluded from the join.
        self.new_att = new_att

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        remaining_cols = self.BMW_df.drop(self.new_att, axis=1)
        return X.join(remaining_cols)
        

def date_magic(d):
    """Convert mixed registration-date strings to decimal years.

    Accepts three formats per element: 'Mon-YY' (e.g. 'Jan-99'),
    'M/YYYY' (e.g. '3/2010') and a bare year (e.g. '2007').  Two-digit
    years greater than 20 are interpreted as 19YY, otherwise as 20YY.
    The result is year + month/12; a bare year is treated as January.

    Parameters
    ----------
    d : iterable
        Raw 'First Registration' values (str- or int-like).

    Returns
    -------
    pd.Series of float decimal years.
    """
    months_dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                   'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    year = []
    month = []
    for el in d:
        el = str(el)

        # 'Mon-YY' format (month name, dash, two-digit year).
        split = el.split('-')
        if len(split) > 1:
            month.append(months_dict[split[0]])
            # Century pivot: '99' -> 1999, '05' -> 2005.
            if int(split[1]) > 20:
                year.append(int('19' + split[1]))
            else:
                year.append(int('20' + split[1]))
            continue

        # 'M/YYYY' format (numeric month, slash, four-digit year).
        split = el.split('/')
        if len(split) > 1:
            month.append(int(split[0]))
            year.append(int(split[1]))
            continue

        # Bare year: assume January.
        month.append(1)
        year.append(int(el))

    month = pd.Series(month)
    year = pd.Series(year)
    # (Removed a dead `num_year = []` initialization that was immediately
    # rebound by the expression below.)
    return pd.Series(year + month / 12)


def readData():
    """Fetch new (not-yet-loaded) car listings from MongoDB.

    Returns a deduplicated DataFrame of documents whose Loaded_in_DW flag
    is False, with the Mongo `_id` bookkeeping column removed.
    """
    import os

    # SECURITY: the connection string below embeds credentials.  Prefer the
    # MONGODB_URI environment variable (backward compatible: the old URI is
    # only a fallback) and rotate the exposed password.
    uri = os.environ.get(
        'MONGODB_URI',
        'mongodb+srv://Martin:Kostadinov@dwprojectcluster.lpqbf.mongodb.net/cars_database?retryWrites=true&w=majority'
    )
    client = MongoClient(uri)

    df_cars = pd.DataFrame(list(client.cars_database.cars.find({})))
    df_cars.drop('_id', axis=1, inplace=True)
    # Keep only entries not yet loaded into the data warehouse.
    df_cars = df_cars[df_cars['Loaded_in_DW'].eq(False)]
    df_cars.drop_duplicates(subset=['ID'], inplace=True)

    return df_cars




# Shared accumulator for cross-validation results across compared methods.
recnik = {}
recnik['Method'] = []  # human-readable label of each run
recnik['Mean Percentage Error'] = []  # mean CV MAPE (sign-flipped to positive)
recnik['Standard Deviation'] = []  # std of the per-fold CV scores
def randomforestCV( max_features, n_estimators, X, y, message, scoring = 'neg_mean_absolute_percentage_error', recnik = recnik):
    """Cross-validate a RandomForestRegressor and record the results.

    Prints the per-fold scores (sign-flipped to positive errors) and
    appends mean/std to the shared `recnik` dict under the `message` label.
    """
    rfr = RandomForestRegressor(max_features = max_features, n_estimators = n_estimators, random_state = 2)
    # Removed a redundant rfr.fit(X, y) here: cross_val_score clones and
    # fits the estimator per fold, so the prior fit was wasted work whose
    # result was never used.
    rfr_scores = cross_val_score(rfr, X, y, scoring = scoring, cv = 5, n_jobs = -1)
    print("Scores:", -rfr_scores)
    print("Mean:", -rfr_scores.mean())
    print("Standard deviation:", rfr_scores.std())
    recnik['Method'].append(message)
    recnik['Mean Percentage Error'].append(-rfr_scores.mean())
    recnik['Standard Deviation'].append(rfr_scores.std())

    
def gridSearch(param_grid, model, X, y):
    """Run a 5-fold GridSearchCV (MAPE scoring) over `param_grid`.

    Prints the mean test error (sign-flipped to positive) for every
    parameter combination and returns the fitted GridSearchCV object.
    """
    searcher = GridSearchCV(model, param_grid, cv=5,
                            scoring='neg_mean_absolute_percentage_error',
                            return_train_score=True,
                            verbose=10, n_jobs=-1)
    searcher.fit(X, y)
    results = searcher.cv_results_
    for mean_score, params in zip(results["mean_test_score"], results["params"]):
        print(-mean_score, params)
    return searcher

1. Reading the data and initial data preprocessing

The data is read from a MongoDB database which is updated daily with new entries from the AutoScout24 website. First we remove the entries that are missing values for fields that are mandatory and cannot be imputed. Next, some of the outliers are removed manually by checking for values that don't make any sense.

The date is transformed into a continuous, decimal value ranging from 1990 to 2020, where the decimal part represents the month of the year.

Given the limited computational power, we will reduce the dataset to only BMW cars first registered no earlier than 2005. The other two pipelines are used for visualization purposes and for selecting the best imputation method, respectively.

# Read the raw listings and build the three preprocessing variants.
df_cars = readData()
# tip=0: modelling set restricted to BMW 1/3-series registered after 2005.
type0_pipeline = Pipeline([
                        ('initial', InitalCleaning()),
                        ('equipment', AdjustEquip()),
                        ('date', RegistrationTransformer(tip = 0))
                    ])

# tip=1: rows that still contain missing values, one-hot encoded —
# used for comparing imputation methods.
type1_pipeline = Pipeline([
                        ('initial', InitalCleaning()),
                        ('equipment', AdjustEquip()),
                        ('date', RegistrationTransformer(tip = 1)),
                        ('encoder', CategoricalEncoder())
                    ])

# Same subset as type0 but without one-hot encoding — used for profiling.
no_category = Pipeline([
                        ('initial', InitalCleaning()),
                        ('equipment', AdjustEquip()),
                        ('date', RegistrationTransformer(tip = 0)),
                    ])


BMW_df = type0_pipeline.fit_transform(df_cars)
nan_df = type1_pipeline.fit_transform(df_cars)
viz_df = no_category.fit_transform(df_cars)

2. Splitting the dataset into training and test sets

# Idiom fix: reset_index(drop=True) replaces the two-step
# reset_index() + drop('index') and renumbers rows 0..n-1 so the split
# indices from StratifiedShuffleSplit line up with .loc lookups.
BMW_df = BMW_df.reset_index(drop=True)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# Stratify on 'Model' so train and test share the same model distribution.
for train_index, test_index in split.split(BMW_df, BMW_df["Model"]):
    strat_train_set = BMW_df.loc[train_index]
    strat_test_set = BMW_df.loc[test_index]

# Separate the regression target ('Price') from the features.
y_train = strat_train_set['Price']
x_train = strat_train_set.drop('Price', axis=1)
y_test = strat_test_set['Price']
x_test = strat_test_set.drop('Price', axis=1)

3. Visualizing the data

import pandas_profiling as pp

from plotly.offline import plot, iplot, init_notebook_mode
from IPython.core.display import display, HTML
# Re-initialize offline plotly rendering for this notebook section.
init_notebook_mode(connected = True)
config={'showLink': False, 'displayModeBar': False}


# Profile only the first 10 (non-equipment) columns and write the report to disk.
pp.ProfileReport(viz_df.iloc[:,np.r_[0:10]]).to_file('aa.html')




4. Choosing the best method of imputation

In the following part we are comparing different methods of predicting the missing values in our dataset. We are evaluating the imputed values by seeing how it affects the accuracy in predicting the final value, which is the price.

# Number of cross-validation folds used throughout the comparison.
N_SPLITS = 5

rng = np.random.RandomState(0)
# Features / target from the frame that still contains missing values.
X = nan_df.drop(columns = ['Price'])
y = nan_df['Price']
X = X.to_numpy()
y = y.to_numpy()
score_simple_imputer = pd.DataFrame()
# Downstream regressor used to score each imputation strategy.
br_estimator = BayesianRidge()
# Baseline: constant-fill imputation with the column mean / median.
for strategy in ('mean', 'median'):
    estimator = make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy=strategy),
        br_estimator
    )
    score_simple_imputer[strategy] = cross_val_score(
        estimator, X, y, scoring='neg_mean_squared_error',
        cv=N_SPLITS
    )
score_simple_imputer
mean median
0 -5.113854e+08 -5.094263e+08
1 -6.983216e+07 -6.792084e+07
2 -7.762901e+07 -7.609985e+07
3 -5.058913e+07 -4.866475e+07
4 -2.984431e+07 -2.770213e+07
# Subsample every 5th row to keep the iterative imputers tractable.
X = X[::5]
y = y[::5]
# Candidate estimators for IterativeImputer's per-feature regressions.
estimators = [
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, random_state=0),
    KNeighborsRegressor(n_neighbors=15),
    BayesianRidge()
]
score_iterative_imputer = pd.DataFrame()
for impute_estimator in estimators:
    estimator = make_pipeline(
        IterativeImputer(random_state=0, estimator=impute_estimator),
        br_estimator
    )
    
    # Score each imputer by the downstream BayesianRidge CV performance.
    score_iterative_imputer[impute_estimator.__class__.__name__] = \
        cross_val_score(
            estimator, X, y, scoring='neg_mean_squared_error', verbose = 2,
            cv=N_SPLITS
        )
score_iterative_imputer
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py:686: ConvergenceWarning:

[IterativeImputer] Early stopping criterion not reached.

[CV] END .................................................... total time=  27.9s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   27.9s remaining:    0.0s
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py:686: ConvergenceWarning:

[IterativeImputer] Early stopping criterion not reached.

[CV] END .................................................... total time=  28.0s
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py:686: ConvergenceWarning:

[IterativeImputer] Early stopping criterion not reached.

[CV] END .................................................... total time=  28.2s
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py:686: ConvergenceWarning:

[IterativeImputer] Early stopping criterion not reached.

[CV] END .................................................... total time=  28.5s
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py:686: ConvergenceWarning:

[IterativeImputer] Early stopping criterion not reached.

[CV] END .................................................... total time=  29.6s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py:686: ConvergenceWarning:

[IterativeImputer] Early stopping criterion not reached.

[CV] END .................................................... total time= 5.0min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.0min remaining:    0.0s
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py:686: ConvergenceWarning:

[IterativeImputer] Early stopping criterion not reached.

[CV] END .................................................... total time= 5.3min
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py:686: ConvergenceWarning:

[IterativeImputer] Early stopping criterion not reached.

[CV] END .................................................... total time= 5.1min
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py:686: ConvergenceWarning:

[IterativeImputer] Early stopping criterion not reached.

[CV] END .................................................... total time= 6.8min
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py:686: ConvergenceWarning:

[IterativeImputer] Early stopping criterion not reached.

[CV] END .................................................... total time= 8.0min
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 30.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py:614: FitFailedWarning:

Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 307, in _fit
    **fit_params_steps[name])
  File "E:\Users\Filip\Anaconda\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py", line 659, in fit_transform
    estimator=None, fit_mode=True)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py", line 306, in _impute_one_feature
    ~missing_row_mask)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\utils\__init__.py", line 344, in _safe_indexing
    return _array_indexing(X, indices, indices_dtype, axis=axis)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\utils\__init__.py", line 179, in _array_indexing
    return array[key] if axis == 0 else array[:, key]
MemoryError: Unable to allocate 2.79 MiB for an array with shape (1224, 299) and data type float64


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.8s remaining:    0.0s
[CV] END .................................................... total time=  13.8s
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py:614: FitFailedWarning:

Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 307, in _fit
    **fit_params_steps[name])
  File "E:\Users\Filip\Anaconda\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py", line 659, in fit_transform
    estimator=None, fit_mode=True)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py", line 306, in _impute_one_feature
    ~missing_row_mask)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\utils\__init__.py", line 344, in _safe_indexing
    return _array_indexing(X, indices, indices_dtype, axis=axis)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\utils\__init__.py", line 179, in _array_indexing
    return array[key] if axis == 0 else array[:, key]
MemoryError: Unable to allocate 2.79 MiB for an array with shape (1225, 299) and data type float64


[CV] END .................................................... total time=  16.2s
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py:614: FitFailedWarning:

Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 307, in _fit
    **fit_params_steps[name])
  File "E:\Users\Filip\Anaconda\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py", line 659, in fit_transform
    estimator=None, fit_mode=True)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py", line 306, in _impute_one_feature
    ~missing_row_mask)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\utils\__init__.py", line 344, in _safe_indexing
    return _array_indexing(X, indices, indices_dtype, axis=axis)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\utils\__init__.py", line 179, in _array_indexing
    return array[key] if axis == 0 else array[:, key]
MemoryError: Unable to allocate 2.79 MiB for an array with shape (1225, 299) and data type float64


[CV] END .................................................... total time=  13.2s
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py:614: FitFailedWarning:

Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 307, in _fit
    **fit_params_steps[name])
  File "E:\Users\Filip\Anaconda\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py", line 659, in fit_transform
    estimator=None, fit_mode=True)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py", line 306, in _impute_one_feature
    ~missing_row_mask)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\utils\__init__.py", line 344, in _safe_indexing
    return _array_indexing(X, indices, indices_dtype, axis=axis)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\utils\__init__.py", line 179, in _array_indexing
    return array[key] if axis == 0 else array[:, key]
MemoryError: Unable to allocate 2.79 MiB for an array with shape (1225, 299) and data type float64


[CV] END .................................................... total time=  12.7s
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py:614: FitFailedWarning:

Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 307, in _fit
    **fit_params_steps[name])
  File "E:\Users\Filip\Anaconda\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py", line 659, in fit_transform
    estimator=None, fit_mode=True)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\impute\_iterative.py", line 306, in _impute_one_feature
    ~missing_row_mask)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\utils\__init__.py", line 344, in _safe_indexing
    return _array_indexing(X, indices, indices_dtype, axis=axis)
  File "E:\Users\Filip\Anaconda\lib\site-packages\sklearn\utils\__init__.py", line 179, in _array_indexing
    return array[key] if axis == 0 else array[:, key]
MemoryError: Unable to allocate 2.79 MiB for an array with shape (1225, 299) and data type float64


[CV] END .................................................... total time=  13.1s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] END .................................................... total time= 3.5min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.5min remaining:    0.0s
[CV] END .................................................... total time= 5.8min
[CV] END .................................................... total time= 4.5min
[CV] END .................................................... total time= 4.4min
[CV] END .................................................... total time= 6.4min
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 24.5min finished
DecisionTreeRegressor ExtraTreesRegressor KNeighborsRegressor BayesianRidge
0 -3.487158e+08 -3.373804e+08 NaN -3.773402e+08
1 -6.291714e+07 -5.908338e+07 NaN -6.276342e+07
2 -8.485837e+07 -8.184054e+07 NaN -9.859064e+07
3 -6.155100e+07 -5.301265e+07 NaN -5.853387e+07
4 -3.570416e+07 -2.946943e+07 NaN -5.408432e+07
# Combine both score tables under a two-level column index.
scores = pd.concat(
    [score_simple_imputer, score_iterative_imputer],
    keys=['SimpleImputer', 'IterativeImputer'], axis=1
)

# Bar chart of mean CV MSE (sign-flipped to positive) per imputation method.
fig, ax = plt.subplots(figsize=(13, 6))
means = -scores.mean()
errors = scores.std()
means.plot.barh(xerr=errors, ax=ax)
ax.set_title('Car Price Regression with Different Imputation Methods')
ax.set_xlabel('MSE (smaller is better)')
ax.set_yticks(np.arange(means.shape[0]))
# Label each bar "Imputer w/ strategy" from the two-level column index.
ax.set_yticklabels([" w/ ".join(label) for label in means.index.tolist()])
plt.tight_layout(pad=1)
plt.show()

5. Data Transformation

The following transformation consists of removing outliers using two of the most widely known methods: the interquartile range (IQR) and the Z-score method. Before applying Z-score outlier removal we first need to scale the numeric variables of the data and bring them to a Gaussian (normal) distribution. Lastly, the categorical variables are encoded.

from sklearn.preprocessing import PowerTransformer

# Full preprocessing pipeline: IQR outlier removal, standardization,
# yeo-johnson normalization, z-score outlier removal, iterative
# imputation of the numeric columns, re-join with the categorical
# columns, and one-hot encoding.
final_pipeline = Pipeline([
                        ('IQR_removal',IQROutlierRemoval(num_att = num_att)),
                        ('std_scaler', StandardScalerIndices()),
                        ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
                        ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att)),
                        ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
                        ('join', JoinTransformer(x_train, num_att)), # join categorical with numerical columns
                        ('encoder', CategoricalEncoder()),
                        

                        ])


X_train_prepared = pd.DataFrame(final_pipeline.fit_transform(x_train))

# Re-align the target with the transformed rows via an index join.
# NOTE(review): presumably the outlier-removal steps dropped rows,
# hence the join — confirm against the custom transformers.
joined = X_train_prepared.join(y_train)
y_train = joined['Price']

6. Visualizing transformed data

The following plot represents a set of histograms, each representing the distribution of the numerical variables after doing all the transformations in the previous step

# Interactive histogram of the transformed numeric variables with a
# dropdown to switch between them.  The original cell repeated the
# same trace/button boilerplate four times; one column list drives
# both the traces and the dropdown buttons instead.
fig = go.Figure()

hist_columns = ['Mileage', 'First Registration', 'Power(hp)', 'Displacement']

# One histogram trace per column; only the first is visible initially.
for idx, column in enumerate(hist_columns):
    fig.add_trace(
        go.Histogram(x=X_train_prepared[column], name=column, visible=(idx == 0))
    )

fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            # Each button shows exactly one trace and retitles the plot.
            buttons=[
                dict(label=column,
                     method="update",
                     args=[{"visible": [j == idx for j in range(len(hist_columns))]},
                           {"title": column}])
                for idx, column in enumerate(hist_columns)
            ],
        )
    ])

plot(fig, filename = 'fig.html', config = config)
display(HTML('fig.html'))

7. Dimensionality Reduction

7.1 Choosing the right number of components

Next, we are trying different methods of reducing the dimensionality of the data. In order to maximize the accuracy, one of the vital things to keep in mind is to preserve the initial variance of the data. For this, we are plotting the preserved variance against the number of components, for a Principal Component Analysis (PCA) and Singular Value Decomposition (SVD) model respectively.

# Cumulative preserved variance vs. number of components, for PCA and
# truncated SVD.  Fix: use explained_variance_ratio_, which is measured
# against the TOTAL variance of the data.  The previous normalization
# (explained_variance_ / sum(explained_variance_)) always sums to 1.0
# over the *kept* components, overstating the preserved variance
# whenever components are discarded (TruncatedSVD never keeps them all).
pca = decomposition.PCA()
pca.n_components = 121
pca_data = pca.fit_transform(X_train_prepared)
percentage_var_explained = pca.explained_variance_ratio_
cum_var_explained = np.cumsum(percentage_var_explained)

svd = TruncatedSVD()
svd.n_components = 121
svd_data = svd.fit_transform(X_train_prepared)
percentage_var_explained_svd = svd.explained_variance_ratio_
cum_var_explained_svd = np.cumsum(percentage_var_explained_svd)


fig = go.Figure()
fig.add_trace(go.Scatter(y = cum_var_explained,  name = 'PCA'))
fig.add_trace(go.Scatter(y = cum_var_explained_svd,  name = 'SVD'))

fig.update_layout(plot_bgcolor='rgb(255,255,255)',xaxis_title="n_components",
    yaxis_title='Variance')
fig.update_xaxes(ticks = 'outside', showline=True, linecolor='black')
fig.update_yaxes(ticks = 'outside', showline=True, linecolor='black')

# Plot figure
plot(fig, filename = 'fig2.html', config = config)
display(HTML('fig2.html'))

7.2 Applying the transformations

Having made the previous analysis we are ready to apply the following dimensionality reduction algorithms:

  1. Principal Component Analysis(PCA)
  2. Sparse Principal Component Analysis(SparsePCA)
  3. Singular Value Decomposition(SVD)
  4. Isomap

From the plot above it is obvious that we need 86 components to preserve 99% of the initial variance of the data.

# Four dimensionality-reduction pipelines.  All share the same
# preprocessing steps (IQR outlier removal, standardization,
# yeo-johnson normalization, z-score outlier removal, iterative
# imputation, re-join of categorical columns, one-hot encoding) and
# differ only in the final reduction step.

# PCA with 86 components (preserves ~99% of the variance, see above).
pca_pipeline = Pipeline([
                        ('IQR_removal',IQROutlierRemoval(num_att = num_att)),
                        ('std_scaler', StandardScalerIndices()),
                        ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
                        ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att)),
                        ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
                        ('join', JoinTransformer(x_train, num_att)), # join categorical with numerical columns
                        ('encoder', CategoricalEncoder()),
                        ('pca', decomposition.PCA(n_components = 86))
                        ])

# Sparse PCA with the default number of components.
spca_pipeline = Pipeline([
                        ('IQR_removal',IQROutlierRemoval(num_att = num_att)),
                        ('std_scaler', StandardScalerIndices()),
                        ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
                        ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att)),
                        ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
                        ('join', JoinTransformer(x_train, num_att)), # join categorical with numerical columns
                        ('encoder', CategoricalEncoder()),
                        ('pca', decomposition.SparsePCA())
                        ])

# Truncated SVD with 86 components.
svd_pipeline = Pipeline([
                        ('IQR_removal',IQROutlierRemoval(num_att = num_att)),
                        ('std_scaler', StandardScalerIndices()),
                        ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
                        ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att)),
                        ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
                        ('join', JoinTransformer(x_train, num_att)), # join categorical with numerical columns
                        ('encoder', CategoricalEncoder()),
                        ('svd', TruncatedSVD(n_components = 86))
                        ])

# Isomap manifold embedding with 60 components.
isomap_pipeline = Pipeline([
                        ('IQR_removal',IQROutlierRemoval(num_att = num_att)),
                        ('std_scaler', StandardScalerIndices()),
                        ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
                        ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att)),
                        ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
                        ('join', JoinTransformer(x_train, num_att)), # join categorical with numerical columns
                        ('encoder', CategoricalEncoder()),
                        ('isomap', Isomap(n_components = 60))
                        ])

# Fit/transform the training data with each pipeline.
# NOTE(review): the target index is reset here, presumably to align
# with the row-filtered transformed matrices — confirm.
X_train_spca = pd.DataFrame(spca_pipeline.fit_transform(x_train))
X_train_pca = pd.DataFrame(pca_pipeline.fit_transform(x_train))
y_train_pca = y_train.reset_index()
y_train_pca = y_train_pca.drop('index', axis = 1)

X_train_svd = pd.DataFrame(svd_pipeline.fit_transform(x_train))
y_train_svd = y_train_pca.copy()

X_train_isomap = pd.DataFrame(isomap_pipeline.fit_transform(x_train))
y_train_isomap = y_train_pca.copy()
E:\Users\Filip\Anaconda\lib\site-packages\ipykernel_launcher.py:8: RuntimeWarning:

invalid value encountered in less

E:\Users\Filip\Anaconda\lib\site-packages\ipykernel_launcher.py:8: RuntimeWarning:

invalid value encountered in less

E:\Users\Filip\Anaconda\lib\site-packages\ipykernel_launcher.py:8: RuntimeWarning:

invalid value encountered in less

E:\Users\Filip\Anaconda\lib\site-packages\ipykernel_launcher.py:8: RuntimeWarning:

invalid value encountered in less

8. Additional Feature Engineering

Here, we will try to improve the data quality by grouping the equipment features. This will result in reducing the binary features in our data, thus reducing the data sparsity. The equipment is classified into the following groups:

  1. Safety Equipment
  2. Luxurious Equipment
  3. General Equipment
  4. Sensors
  5. Lights

The same methods of dimensionality reduction are being applied to this newly formed dataset.

# Equipment feature groups: each list names the binary equipment
# columns that are collapsed into a single per-group column later on.
safety = ['ABS','Traction control','Driver-side airbag','Side airbag','Passenger-side airbag','Isofix','Immobilizer']
luxury = ['Adaptive Cruise Control','Leather steering wheel','Massage seats','Heated steering wheel','Panorama roof','Touch screen',
          'Keyless central door lock','Electrically heated windshield','Alloy wheels','Sunroof','Electrically adjustable seats',
         'Navigation system']
general = ['Multi-function steering wheel','Air suspension','Hill Holder','USB','Non-smoking Vehicle','Air conditioning','Automatic climate control','Radio','Bluetooth', 'CD player',
          'Power windows','Central door lock','On-board computer','Alarm system', 'Trailer hitch', 'Ski bag','MP3','Digital radio',
          'Armrest','Power steering','Electrical side mirrors','Roof rack']
sensors = ['Parking assist system sensors rear','Parking assist system sensors front','Night view assist', 'Blind spot monitor',  'Parking assist system camera'
          , 'Parking assist system self-steering','Lane departure warning system','Traffic sign recognition',
           'Electronic stability control','Tire pressure monitoring system','Electric tailgate','Rain sensor','Start-stop system']
lights = ['LED Daytime Running Lights','LED Headlights','Adaptive headlights','Daytime running lights','Xenon headlights','Fog lights']
# 'recnik' is Macedonian for 'dictionary': maps group name -> member columns.
recnik2 = {'luxury':luxury, 'safety':safety, 'general':general, 'sensors':sensors, 'lights':lights}




# Numeric attributes of the grouped data set: the original numeric
# columns plus the new per-group equipment count columns.
num_att_new = ['First Registration',
               'Mileage',
               'Power(hp)',
               'Displacement', 'luxury', 'safety', 'general', 'lights', 'sensors']

# Collapse the individual binary equipment columns into one count
# column per group (how many items of that group a car has), dropping
# the member columns.  Fixes: removed a stray no-op `x_train_red`
# expression statement, and iterate .items() instead of re-indexing
# the dict through .keys().
x_train_red = x_train.copy()
for group, members in recnik2.items():
    x_train_red[group] = 0
    for member in members:
        x_train_red[group] += x_train_red[member]
        x_train_red = x_train_red.drop(member, axis = 1)

        

# Preprocessing pipeline for the grouped data set — same steps as the
# full pipeline but using num_att_new; no dimensionality reduction
# (the SparsePCA step is left commented out).
spca_pipeline_reduced = Pipeline([
                        ('IQR_removal',IQROutlierRemoval(num_att = num_att_new)),
                        ('std_scaler', StandardScalerIndices()),
                        ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
                        ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att_new)),
                        ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
                        ('join', JoinTransformer(x_train_red, num_att_new)), # join categorical with numerical columns
                        ('encoder', CategoricalEncoder()),
                        #('pca', decomposition.SparsePCA())
                        ])


reduced_x = pd.DataFrame(spca_pipeline_reduced.fit_transform(x_train_red))
# Re-attach the target via an index join, and keep an index-reset copy
# for the experiments on the reduced data set below.
joined = reduced_x.join(y_train)
reduced_y = joined['Price']
reduced_reset_y = reduced_y.reset_index()
reduced_reset_y = reduced_reset_y.drop('index', axis = 1 )
E:\Users\Filip\Anaconda\lib\site-packages\ipykernel_launcher.py:8: RuntimeWarning:

invalid value encountered in less

E:\Users\Filip\Anaconda\lib\site-packages\ipykernel_launcher.py:8: RuntimeWarning:

invalid value encountered in less

E:\Users\Filip\Anaconda\lib\site-packages\ipykernel_launcher.py:8: RuntimeWarning:

invalid value encountered in less

E:\Users\Filip\Anaconda\lib\site-packages\ipykernel_launcher.py:8: RuntimeWarning:

invalid value encountered in less

Choosing the right number of components

Done using the same method as before, keeping the preserved variance as close as possible to the initial value.

# Same analysis for the grouped data set: cumulative preserved
# variance for truncated SVD and PCA.  Fix: use
# explained_variance_ratio_ (relative to the data's total variance)
# instead of normalizing explained_variance_ by its own sum, which
# always reaches 1.0 over the kept components.  The y argument is
# ignored by these transformers, so it is no longer passed.
svd = TruncatedSVD()
svd.n_components = 60
svd_data = svd.fit_transform(reduced_x)
percentage_var_explained_svd = svd.explained_variance_ratio_
cum_var_explained_svd = np.cumsum(percentage_var_explained_svd)

pca = decomposition.PCA()
pca.n_components = 60
pca_data = pca.fit_transform(reduced_x)
percentage_var_explained = pca.explained_variance_ratio_
cum_var_explained = np.cumsum(percentage_var_explained)



fig = go.Figure()
fig.add_trace(go.Scatter(y = cum_var_explained,  name = 'PCA'))
fig.add_trace(go.Scatter(y = cum_var_explained_svd,  name = 'SVD'))

fig.update_layout(plot_bgcolor='rgb(255,255,255)',xaxis_title="n_components",
    yaxis_title='Variance')
fig.update_xaxes(ticks = 'outside', showline=True, linecolor='black')
fig.update_yaxes(ticks = 'outside', showline=True, linecolor='black')

# Plot figure
plot(fig, filename = 'fig10.html', config = config)
display(HTML('fig10.html'))

Applying the transformations

# Dimensionality-reduction pipelines for the grouped data set:
# SparsePCA (default components), PCA(34), and TruncatedSVD(34) on top
# of the shared preprocessing steps.
spca_pipeline_reduced2 = Pipeline([
                        ('IQR_removal',IQROutlierRemoval(num_att = num_att_new)),
                        ('std_scaler', StandardScalerIndices()),
                        ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
                        ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att_new)),
                        ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
                        ('join', JoinTransformer(x_train_red, num_att_new)), # join categorical with numerical columns
                        ('encoder', CategoricalEncoder()),
                        ('pca', decomposition.SparsePCA())
                        ])

pca_pipeline_reduced = Pipeline([
                        ('IQR_removal',IQROutlierRemoval(num_att = num_att_new)),
                        ('std_scaler', StandardScalerIndices()),
                        ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
                        ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att_new)),
                        ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
                        ('join', JoinTransformer(x_train_red, num_att_new)), # join categorical with numerical columns
                        ('encoder', CategoricalEncoder()),
                        ('pca', decomposition.PCA(n_components = 34))
                        ])


svd_pipeline_reduced = Pipeline([
                        ('IQR_removal',IQROutlierRemoval(num_att = num_att_new)),
                        ('std_scaler', StandardScalerIndices()),
                        ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
                        ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att_new)),
                        ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
                        ('join', JoinTransformer(x_train_red, num_att_new)), # join categorical with numerical columns
                        ('encoder', CategoricalEncoder()),
                        ('SVD', TruncatedSVD(n_components = 34))
                        ])
# Fit/transform the grouped training data with each pipeline.
reduced_x_spca = pd.DataFrame(spca_pipeline_reduced2.fit_transform(x_train_red))
reduced_x_svd = pd.DataFrame(svd_pipeline_reduced.fit_transform(x_train_red))
reduced_x_pca = pd.DataFrame(pca_pipeline_reduced.fit_transform(x_train_red))

9. Label Encoding instead of One Hot Encoding

When encoding the categorical, independent (input) variables of the data we mainly use the One Hot Encoding technique, where each newly created variable represents one level of the categorical feature. 0 represents absence while 1 represents the presence of that category. We use this approach when handling nominal data, where the categories do not have an inherent order. On the other hand, by using Label Encoder we are imposing ordinality in the data, thus making some levels of the categorical feature more important than others. Unsurprisingly, this might affect some algorithms negatively.

However, when using tree-based ensemble models it might be just the opposite. These types of algorithms work well with categorical features and there is no difference whether the features are ordinal or nominal. This is because the algorithms do not take the ordinality of the categorical features into account.

# Preprocessing pipeline WITHOUT the one-hot encoder step: the
# categorical columns are kept as-is so they can be label encoded below.
final_pipeline = Pipeline([
                        ('IQR_removal',IQROutlierRemoval(num_att = num_att)),
                        ('std_scaler', StandardScalerIndices()),
                        ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
                        ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att)),
                        ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
                        ('join', JoinTransformer(x_train, num_att)), # join categorical with numerical columns
                        
                        

                        ])


X_train_label = pd.DataFrame(final_pipeline.fit_transform(x_train))


# Label encode each categorical column in place.
# NOTE(review): LabelEncoder is documented for encoding *targets*;
# sklearn's OrdinalEncoder is the feature-oriented equivalent.  The
# per-column loop works, but imposes an arbitrary integer ordering.
cat = [ 'Make','Model', 'Fuel', 'Body', 'Gearing Type']
encoder = LabelEncoder()
for c in cat:
    X_train_label[c] = encoder.fit_transform(X_train_label[c])

# Sparse PCA on the label-encoded matrix (default component count).
spca = decomposition.SparsePCA()
X_train_label_sparse = pd.DataFrame(spca.fit_transform(X_train_label)) 

10. Testing different models

10.1 Testing the different data sets with RandomForestRegressor model

10.1.1 Transformations made:

  1. Removing outliers with the Interquartile range method
  2. Standardizing data
  3. Normalizing the data using the yeo-johnson power transformation
  4. Removing outliers with the Z Score method
  5. Imputing missing values
  6. Encoding categorical variables using One Hot Encoding
# Tune max_features for a random forest on the fully prepared data,
# then cross-validate using the best parameters found.
param_grid = {'max_features': [90, 100, 110, 'auto', 'sqrt', 'log2'],
              'n_estimators': [100]}
grid_no_dim_red = gridSearch(param_grid=param_grid,
                             model=RandomForestRegressor(random_state=2),
                             X=X_train_prepared, y=y_train)
best = grid_no_dim_red.best_params_
randomforestCV(max_features=best['max_features'],
               n_estimators=best['n_estimators'],
               X=X_train_prepared, y=y_train,
               message='Without dim reduction')
Fitting 5 folds for each of 3 candidates, totalling 15 fits
0.1776572804812721 {'max_features': 90, 'n_estimators': 100}
0.17552217006843734 {'max_features': 100, 'n_estimators': 100}
0.17290840059188065 {'max_features': 110, 'n_estimators': 100}

10.1.2 Transformations made:

  1. Removing outliers with the Interquartile range method
  2. Standardizing data
  3. Normalizing the data using the yeo-johnson power transformation
  4. Removing outliers with the Z Score method
  5. Imputing missing values
  6. Encoding categorical variables using One Hot Encoding
  7. Principal Component Analysis
# Tune max_features for a random forest on the PCA-reduced data,
# then cross-validate using the best parameters found.
param_grid = {'n_estimators': [100],
              'max_features': [50, 60, 70, 'auto', 'sqrt', 'log2']}
grid_pca = gridSearch(param_grid=param_grid,
                      model=RandomForestRegressor(random_state=2),
                      X=X_train_pca, y=y_train_pca)
best = grid_pca.best_params_
randomforestCV(max_features=best['max_features'],
               n_estimators=best['n_estimators'],
               X=X_train_pca, y=y_train_pca,
               message='PCA applied')
Fitting 5 folds for each of 6 candidates, totalling 30 fits
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_search.py:880: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

0.2632499409622097 {'max_features': 50, 'n_estimators': 100}
0.26498974903189654 {'max_features': 60, 'n_estimators': 100}
0.25665490517863254 {'max_features': 70, 'n_estimators': 100}
0.26201969408428416 {'max_features': 'auto', 'n_estimators': 100}
0.3902310756225239 {'max_features': 'sqrt', 'n_estimators': 100}
0.43990704437638756 {'max_features': 'log2', 'n_estimators': 100}
E:\Users\Filip\Anaconda\lib\site-packages\ipykernel_launcher.py:7: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

Scores: [0.30631916 0.23482107 0.22539615 0.32499212 0.19174603]
Mean: 0.25665490517863254
Standard deviation: 0.050603362416815095

10.1.3 Transformations made:

  1. Removing outliers with the Interquartile range method
  2. Standardizing data
  3. Normalizing the data using the yeo-johnson power transformation
  4. Removing outliers with the Z Score method
  5. Imputing missing values
  6. Encoding categorical variables using One Hot Encoding
  7. Sparse Principal Component Analysis
# Tune max_features for a random forest on the SparsePCA-reduced data,
# then cross-validate using the best parameters found.
param_grid = {'n_estimators': [100],
              'max_features': [50, 60, 70, 80, 'auto', 'sqrt', 'log2']}
grid_spca = gridSearch(param_grid=param_grid,
                       model=RandomForestRegressor(random_state=2),
                       X=X_train_spca, y=y_train_pca)
best = grid_spca.best_params_
randomforestCV(max_features=best['max_features'],
               n_estimators=best['n_estimators'],
               X=X_train_spca, y=y_train_pca,
               message='SparsePCA applied')
Fitting 5 folds for each of 7 candidates, totalling 35 fits
E:\Users\Filip\Anaconda\lib\site-packages\sklearn\model_selection\_search.py:880: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

0.18999751654181588 {'max_features': 50, 'n_estimators': 100}
0.1868007961519182 {'max_features': 60, 'n_estimators': 100}
0.1839385185633903 {'max_features': 70, 'n_estimators': 100}
0.17833338438845006 {'max_features': 80, 'n_estimators': 100}
0.17134234990247948 {'max_features': 'auto', 'n_estimators': 100}
0.24018857292185974 {'max_features': 'sqrt', 'n_estimators': 100}
0.27950399459533326 {'max_features': 'log2', 'n_estimators': 100}
E:\Users\Filip\Anaconda\lib\site-packages\ipykernel_launcher.py:7: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

Scores: [0.17735928 0.14917622 0.18123541 0.20427244 0.14466839]
Mean: 0.17134234990247948
Standard deviation: 0.022005927211309895

10.1.4 Transformations made:

  1. Removing outliers with the Interquartile range method
  2. Standardizing data
  3. Normalizing the data using the yeo-johnson power transformation
  4. Removing outliers with the Z Score method
  5. Imputing missing values
  6. Encoding categorical variables using One Hot Encoding
  7. Singular Value Decomposition
# Tune max_features for a random forest on the SVD-reduced data,
# then cross-validate using the best parameters found.
param_grid = {'n_estimators': [100],
              'max_features': [40, 50, 60, 70, 'auto', 'sqrt', 'log2']}
y_svd = y_train_svd.values.ravel()   # flatten the one-column frame once
grid_svd = gridSearch(param_grid=param_grid,
                      model=RandomForestRegressor(random_state=2),
                      X=X_train_svd, y=y_svd)
best = grid_svd.best_params_
randomforestCV(max_features=best['max_features'],
               n_estimators=best['n_estimators'],
               X=X_train_svd, y=y_svd,
               message='SVD applied')
Fitting 5 folds for each of 7 candidates, totalling 35 fits
0.2591085380020529 {'max_features': 40, 'n_estimators': 100}
0.25149058977056943 {'max_features': 50, 'n_estimators': 100}
0.2471912109167899 {'max_features': 60, 'n_estimators': 100}
0.24969303787892233 {'max_features': 70, 'n_estimators': 100}
0.24595220203036372 {'max_features': 'auto', 'n_estimators': 100}
0.3643364373467085 {'max_features': 'sqrt', 'n_estimators': 100}
0.41757278199511016 {'max_features': 'log2', 'n_estimators': 100}
Scores: [0.29666501 0.20550214 0.20697858 0.31826883 0.20234646]
Mean: 0.24595220203036372
Standard deviation: 0.0507111323270761

10.1.5 Transformations made:

  1. Grouping the categorical features
  2. Removing outliers with the Interquartile range method
  3. Standardizing data
  4. Normalizing the data using the yeo-johnson power transformation
  5. Removing outliers with the Z Score method
  6. Imputing missing values
  7. Encoding categorical variables using One Hot Encoding
# Tune max_features for a random forest on the grouped data set
# (no dimensionality reduction), then cross-validate with the best.
param_grid = {'n_estimators': [100],
              'max_features': [15, 20, 25, 30, 40, 50, 'auto', 'sqrt', 'log2']}
grid_reduced = gridSearch(param_grid=param_grid,
                          model=RandomForestRegressor(random_state=2),
                          X=reduced_x, y=reduced_y)
best = grid_reduced.best_params_
randomforestCV(max_features=best['max_features'],
               n_estimators=best['n_estimators'],
               X=reduced_x, y=reduced_y,
               message='Reduced Data Set w/o dim reduction')
Fitting 5 folds for each of 9 candidates, totalling 45 fits
0.20827315588670636 {'max_features': 15, 'n_estimators': 100}
0.20726515933441392 {'max_features': 20, 'n_estimators': 100}
0.202848650916612 {'max_features': 25, 'n_estimators': 100}
0.20961127089529463 {'max_features': 30, 'n_estimators': 100}
0.20874556563507535 {'max_features': 40, 'n_estimators': 100}
0.20552685333932125 {'max_features': 50, 'n_estimators': 100}
0.20584322191859186 {'max_features': 'auto', 'n_estimators': 100}
0.22151144497992128 {'max_features': 'sqrt', 'n_estimators': 100}
0.22949370196190375 {'max_features': 'log2', 'n_estimators': 100}
Scores: [0.23672151 0.17744192 0.20077917 0.23578265 0.163518  ]
Mean: 0.202848650916612
Standard deviation: 0.029761396756656674

10.1.6 Transformations made:

  1. Grouping the categorical features
  2. Removing outliers with the Interquartile range method
  3. Standardizing data
  4. Normalizing the data using the yeo-johnson power transformation
  5. Removing outliers with the Z Score method
  6. Imputing missing values
  7. Encoding categorical variables using One Hot Encoding
  8. Principal Component Analysis
# Tune max_features for a random forest on the grouped + PCA-reduced
# data, then cross-validate with the best parameters found.
param_grid = {'n_estimators': [100],
              'max_features': [10, 15, 20, 25, 30, 'auto', 'sqrt', 'log2']}
y_flat = reduced_reset_y.values.ravel()
grid_reduced_PCA = gridSearch(param_grid=param_grid,
                              model=RandomForestRegressor(random_state=2),
                              X=reduced_x_pca, y=y_flat)
best = grid_reduced_PCA.best_params_
randomforestCV(max_features=best['max_features'],
               n_estimators=best['n_estimators'],
               X=reduced_x_pca, y=y_flat,
               message='Reduced Data Set + PCA')
Fitting 5 folds for each of 8 candidates, totalling 40 fits
0.23977983744751255 {'max_features': 10, 'n_estimators': 100}
0.23545713154856712 {'max_features': 15, 'n_estimators': 100}
0.23089488579114587 {'max_features': 20, 'n_estimators': 100}
0.23438742839516474 {'max_features': 25, 'n_estimators': 100}
0.23064242533075358 {'max_features': 30, 'n_estimators': 100}
0.232256773641396 {'max_features': 'auto', 'n_estimators': 100}
0.25620055764171856 {'max_features': 'sqrt', 'n_estimators': 100}
0.25620055764171856 {'max_features': 'log2', 'n_estimators': 100}
Scores: [0.27325944 0.23175406 0.18939139 0.28359633 0.17521091]
Mean: 0.23064242533075358
Standard deviation: 0.043349422556213775

10.1.7 Transformations made:

  1. Grouping the categorical features
  2. Removing outliers with the Interquartile range method
  3. Standardizing data
  4. Normalizing the data using the yeo-johnson power transformation
  5. Removing outliers with the Z Score method
  6. Imputing missing values
  7. Encoding categorical variables using One Hot Encoding
  8. Sparse Principal Component Analysis
# Tune max_features for a random forest on the grouped +
# SparsePCA-reduced data, then cross-validate with the best.
param_grid = {'n_estimators': [100],
              'max_features': [10, 15, 20, 21, 'auto', 'sqrt', 'log2', 40]}
y_flat = reduced_reset_y.values.ravel()
grid_reduced_SPCA = gridSearch(param_grid=param_grid,
                               model=RandomForestRegressor(random_state=2),
                               X=reduced_x_spca, y=y_flat)
best = grid_reduced_SPCA.best_params_
randomforestCV(max_features=best['max_features'],
               n_estimators=best['n_estimators'],
               X=reduced_x_spca, y=y_flat,
               message='Reduced Data Set + SparsePCA')
Fitting 5 folds for each of 8 candidates, totalling 40 fits
0.21730245336373105 {'max_features': 10, 'n_estimators': 100}
0.21312662732149512 {'max_features': 15, 'n_estimators': 100}
0.2096852786250202 {'max_features': 20, 'n_estimators': 100}
0.2101059294139272 {'max_features': 21, 'n_estimators': 100}
0.2121605533930075 {'max_features': 'auto', 'n_estimators': 100}
0.22474391238642336 {'max_features': 'sqrt', 'n_estimators': 100}
0.22856471455386237 {'max_features': 'log2', 'n_estimators': 100}
0.21212232761630542 {'max_features': 40, 'n_estimators': 100}
Scores: [0.23371654 0.17076633 0.19241571 0.26767448 0.18385333]
Mean: 0.2096852786250202
Standard deviation: 0.03583423390282167

10.1.8 Transformations made:

  1. Grouping the categorical features
  2. Removing outliers with the Interquartile range method
  3. Standardizing data
  4. Normalizing the data using the yeo-johnson power transformation
  5. Removing outliers with the Z Score method
  6. Imputing missing values
  7. Encoding categorical variables using One Hot Encoding
  8. Singular Value Decomposition
# Tune max_features for a random forest on the grouped + SVD-reduced
# data, then cross-validate with the best parameters found.
# (Note the different random_state=42 used for this grid search.)
param_grid = {'n_estimators': [100],
              'max_features': [10, 15, 20, 25, 30, 'auto', 'sqrt', 'log2']}
y_flat = reduced_reset_y.values.ravel()
grid_reduced_svd = gridSearch(param_grid=param_grid,
                              model=RandomForestRegressor(random_state=42),
                              X=reduced_x_svd, y=y_flat)
best = grid_reduced_svd.best_params_
randomforestCV(max_features=best['max_features'],
               n_estimators=best['n_estimators'],
               X=reduced_x_svd, y=y_flat,
               message='Reduced Data Set + SVD')
Fitting 5 folds for each of 8 candidates, totalling 40 fits
0.22933369786004162 {'max_features': 10, 'n_estimators': 100}
0.2269752493361814 {'max_features': 15, 'n_estimators': 100}
0.2227976041176772 {'max_features': 20, 'n_estimators': 100}
0.2257904842292962 {'max_features': 25, 'n_estimators': 100}
0.22783788953210418 {'max_features': 30, 'n_estimators': 100}
0.2321367570877284 {'max_features': 'auto', 'n_estimators': 100}
0.25367161813333 {'max_features': 'sqrt', 'n_estimators': 100}
0.25367161813333 {'max_features': 'log2', 'n_estimators': 100}
Scores: [0.25749898 0.21469708 0.19729628 0.27671054 0.1686053 ]
Mean: 0.2229616366809338
Standard deviation: 0.039404053356886126

10.1.9 Transformations made:

  1. Grouping the categorical features
  2. Removing outliers with the Interquartile range method
  3. Standardizing data
  4. Normalizing the data using the yeo-johnson power transformation
  5. Removing outliers with the Z Score method
  6. Imputing missing values
  7. Encoding categorical variables using Label Encoder
# Random forest on the label-encoded data (no dimensionality reduction).
rfr = RandomForestRegressor(random_state = 2,max_depth=17)
rfr.fit(X_train_label,y_train)
# 5-fold CV scored as negative MAPE (sklearn maximizes scores).
rfr_scores = cross_val_score(rfr, X_train_label, y_train, scoring = 'neg_mean_absolute_percentage_error', 
                             cv = 5, n_jobs = -1)
# Record the run in the shared results dict ('recnik' = 'dictionary');
# append order across these cells determines row order in the table.
recnik['Method'].append('Label Encoded w/o Dimenisonality Reduction')
recnik['Mean Percentage Error'].append(-rfr_scores.mean())
recnik['Standard Deviation'].append(rfr_scores.std())

10.2 XGBRegressor

# XGBoost regressor on the one-hot-encoded prepared data.
xgb = XGBRegressor(random_state = 2) 
xgb.fit(X_train_prepared, y_train)
# 5-fold CV scored as negative MAPE; negated below for reporting.
xgb_scores = cross_val_score(xgb, X_train_prepared, y_train, scoring = 'neg_mean_absolute_percentage_error', cv = 5, n_jobs = -1)

print("Scores:", -xgb_scores)
print("Mean:", -xgb_scores.mean())
print("Standard deviation:", xgb_scores.std())
# Record the run in the shared results dict.
recnik['Method'].append('XGBRegressor w/o dim reduction')
recnik['Mean Percentage Error'].append(-xgb_scores.mean())
recnik['Standard Deviation'].append(xgb_scores.std())
Scores: [0.19700077 0.2024891  0.17091389 0.19927036 0.16294556]
Mean: 0.186523937005948
Standard deviation: 0.01628947737014463

10.3 Ensemble Regressor consisting of XGBRegressor and RandomForest

Results without using any dimensionality reduction algorithm

from sklearn.ensemble import VotingRegressor

# Ensemble averaging an XGBoost model and a depth-limited random
# forest, evaluated on the label-encoded data.
vote_mod = VotingRegressor([ ('XGBRegressor', XGBRegressor()), 
                            ('RandomForest', RandomForestRegressor(random_state = 2,max_depth=17))])
vote_mod.fit(X_train_label, y_train)
# 5-fold CV scored as negative MAPE.
reg_scores = cross_val_score(vote_mod, X_train_label, y_train, scoring = 'neg_mean_absolute_percentage_error', 
                             cv = 5, n_jobs = -1)


print("Scores:", -reg_scores)
print("Mean:", -reg_scores.mean())
print("Standard deviation:", reg_scores.std())

# Record the run in the shared results dict.
recnik['Method'].append('Voting Regressor w/o dim reduction')
recnik['Mean Percentage Error'].append(-reg_scores.mean())
recnik['Standard Deviation'].append(reg_scores.std())

Results using Sparse Principal Component Analysis

# Same voting ensemble, evaluated on the SparsePCA-transformed,
# label-encoded data (vote_mod is reused from the previous cell).
reg_scores = cross_val_score(vote_mod, X_train_label_sparse, y_train_pca.values.ravel(), scoring = 'neg_mean_absolute_percentage_error', 
                             cv = 5)
print("Scores:", -reg_scores)
print("Mean:", -reg_scores.mean())
print("Standard deviation:", reg_scores.std())

# Record the run in the shared results dict.
recnik['Method'].append('Voting Regressor + SparsePCA')
recnik['Mean Percentage Error'].append(-reg_scores.mean())
recnik['Standard Deviation'].append(reg_scores.std())
 

11. Results

pd.DataFrame.from_dict(recnik)
Method Mean Percentage Error Standard Deviation
0 Without dim reduction 0.172039 0.023894
1 PCA applied 0.256655 0.050603
2 SparsePCA applied 0.171342 0.022006
3 SVD applied 0.245952 0.050711
4 Reduced Data Set w/o dim reduction 0.202849 0.029761
5 Reduced Data Set + PCA 0.230642 0.043349
6 Reduced Data Set + SparsePCA 0.209685 0.035834
7 Reduced Data Set + SVD 0.222962 0.039404
8 XGBRegressor w/o dim reduction 0.186524 0.016289
9 GradientBoostingRegressor + Hyperparameter tuning 0.228590 0.025933
10 Label Encoded w/o Dimenisonality Reduction 0.170918 0.018696
11 Label Encoded w/o Dimenisonality Reduction 0.169916 0.019221
12 Voting Regressor w/o dim reduction 0.169184 0.016196
13 Voting Regressor + SparsePCA 0.169161 0.016898